# Notebook-style EDA + RFM segmentation of an online-retail transactions dataset.
# NOTE(review): blanket-suppressing every warning also hides pandas deprecation
# notices — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
# Enable offline plotly rendering in the notebook; alias iplot for brevity.
py.offline.init_notebook_mode()
pyplot = py.offline.iplot
import os
# Work from the project directory that holds data.csv (machine-specific path).
os.chdir(r'/Users/xuefeng/Desktop/Project')
# Load the raw transactions; CustomerID is read as text so IDs keep their
# exact form (no float coercion of numeric-looking IDs).
data = pd.read_csv(
    'data.csv',
    encoding='unicode_escape',
    dtype={'CustomerID': str},
)
data.shape
(541909, 8)
# Preview the first rows of the raw data.
data.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
# Column dtypes and null counts: Description and CustomerID contain missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 540455 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null object 5 UnitPrice 541909 non-null float64 6 CustomerID 406829 non-null object 7 Country 541909 non-null object dtypes: float64(1), int64(1), object(6) memory usage: 33.1+ MB
# Returned orders are flagged with an InvoiceNo beginning with 'C';
# their Quantity is a negative value.
data.loc[data['InvoiceNo'].str[0] == 'C']
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 141 | C536379 | D | Discount | -1 | 12/1/2010 9:41 | 27.50 | 14527 | United Kingdom |
| 154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 12/1/2010 9:49 | 4.65 | 15311 | United Kingdom |
| 235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 12/1/2010 10:24 | 1.65 | 17548 | United Kingdom |
| 236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 12/1/2010 10:24 | 0.29 | 17548 | United Kingdom |
| 237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 12/1/2010 10:24 | 0.29 | 17548 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 540449 | C581490 | 23144 | ZINC T-LIGHT HOLDER STARS SMALL | -11 | 12/9/2011 9:57 | 0.83 | 14397 | United Kingdom |
| 541541 | C581499 | M | Manual | -1 | 12/9/2011 10:28 | 224.69 | 15498 | United Kingdom |
| 541715 | C581568 | 21258 | VICTORIAN SEWING BOX LARGE | -5 | 12/9/2011 11:57 | 10.95 | 15311 | United Kingdom |
| 541716 | C581569 | 84978 | HANGING HEART JAR T-LIGHT HOLDER | -1 | 12/9/2011 11:58 | 1.25 | 17315 | United Kingdom |
| 541717 | C581569 | 20979 | 36 PENCILS TUBE RED RETROSPOT | -5 | 12/9/2011 11:58 | 1.25 | 17315 | United Kingdom |
9288 rows × 8 columns
# Share of missing values per column, highest first (CustomerID ~24.9%).
# Vectorised isnull().mean() replaces the column-wise apply/lambda: same
# result, one C-level pass instead of a Python loop over columns.
data.isnull().mean().sort_values(ascending = False)
CustomerID 0.249267 Description 0.002683 InvoiceNo 0.000000 StockCode 0.000000 Quantity 0.000000 InvoiceDate 0.000000 UnitPrice 0.000000 Country 0.000000 dtype: float64
# Check product category
# Distinct Description strings; note non-product entries such as 'lost'.
data['Description'].unique()
array(['WHITE HANGING HEART T-LIGHT HOLDER', 'WHITE METAL LANTERN',
'CREAM CUPID HEARTS COAT HANGER', ..., 'lost',
'CREAM HANGING HEART T-LIGHT HOLDER',
'PAPER CRAFT , LITTLE BIRDIE'], dtype=object)
In a real-world project, we would verify the missing values with the relevant department before taking any action.
# Description is not needed for RFM analysis; drop it, and tag orders with no
# CustomerID as 'COD' so they are kept rather than discarded.
data.drop(columns=['Description'], inplace=True)
data['CustomerID'] = data['CustomerID'].fillna('COD')
data['InvoiceDate']
0 12/1/2010 8:26
1 12/1/2010 8:26
2 12/1/2010 8:26
3 12/1/2010 8:26
4 12/1/2010 8:26
...
541904 12/9/2011 12:50
541905 12/9/2011 12:50
541906 12/9/2011 12:50
541907 12/9/2011 12:50
541908 12/9/2011 12:50
Name: InvoiceDate, Length: 541909, dtype: object
# Extract the calendar date from the raw InvoiceDate string.
# NOTE(review): sample dates like '12/1/2010' parse month-first by default —
# confirm that matches the source system; errors='coerce' silently turns any
# unparseable row into NaT.
data['Date'] = pd.to_datetime(data.InvoiceDate, errors = 'coerce').dt.date
data['Date'].head()
0 2010-12-01 1 2010-12-01 2 2010-12-01 3 2010-12-01 4 2010-12-01 Name: Date, dtype: object
# Extract the clock time. InvoiceDate is parsed a second time here — the parse
# result could be hoisted into one shared variable to halve the work.
data['Time'] = pd.to_datetime(data.InvoiceDate, errors = 'coerce').dt.time
data['Time'].head()
0 08:26:00 1 08:26:00 2 08:26:00 3 08:26:00 4 08:26:00 Name: Time, dtype: object
# Confirm the new Date/Time columns.
data.head()
| InvoiceNo | StockCode | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Date | Time | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 |
| 1 | 536365 | 71053 | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 |
| 2 | 536365 | 84406B | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 |
| 3 | 536365 | 84029G | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 |
| 4 | 536365 | 84029E | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 |
# The raw InvoiceDate string is redundant once Date/Time exist.
data.drop(['InvoiceDate'], axis = 1, inplace = True)
# Parse Date once and derive Year/Month/Day from the same parsed series
# (the original re-parsed the whole column three times).
dates = pd.to_datetime(data.Date)
data['Year'] = dates.dt.year
data['Month'] = dates.dt.month
data['Day'] = dates.dt.day
data.head()
| InvoiceNo | StockCode | Quantity | UnitPrice | CustomerID | Country | Date | Time | Year | Month | Day | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | 6 | 2.55 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 |
| 1 | 536365 | 71053 | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 |
| 2 | 536365 | 84406B | 8 | 2.75 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 |
| 3 | 536365 | 84029G | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 |
| 4 | 536365 | 84029E | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 |
# Convert Date (currently python date objects) to a proper datetime64 column
# so later date arithmetic works.
data['Date'] = pd.to_datetime(data['Date'])
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Quantity 541909 non-null int64 3 UnitPrice 541909 non-null float64 4 CustomerID 541909 non-null object 5 Country 541909 non-null object 6 Date 541909 non-null datetime64[ns] 7 Time 541909 non-null object 8 Year 541909 non-null int64 9 Month 541909 non-null int64 10 Day 541909 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(4), object(5) memory usage: 45.5+ MB
# Line total = Quantity * UnitPrice.
# Vectorised column arithmetic replaces the row-wise apply with positional
# indexing (x[2] * x[3]): the positional form silently breaks if the column
# order ever changes, is deprecated inside apply, and is orders of magnitude
# slower than a column product.
data['Total'] = data['Quantity'] * data['UnitPrice']
data.head()
| InvoiceNo | StockCode | Quantity | UnitPrice | CustomerID | Country | Date | Time | Year | Month | Day | Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | 6 | 2.55 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 15.30 |
| 1 | 536365 | 71053 | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 2 | 536365 | 84406B | 8 | 2.75 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 22.00 |
| 3 | 536365 | 84029G | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 4 | 536365 | 84029E | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
# Count exact duplicate rows (True = duplicate of an earlier row).
data.duplicated().value_counts()
False 536639 True 5270 dtype: int64
# Remove the exact duplicates, then verify that none remain.
data = data.drop_duplicates()
data.duplicated().value_counts()
False 536639 dtype: int64
# Numeric summary: the negative Quantity/UnitPrice minimums come from
# returns and accounting adjustments.
data.describe()
| Quantity | UnitPrice | Year | Month | Day | Total | |
|---|---|---|---|---|---|---|
| count | 536639.000000 | 536639.000000 | 536639.000000 | 536639.000000 | 536639.000000 | 536639.000000 |
| mean | 9.619500 | 4.632660 | 2010.921771 | 7.544819 | 15.024639 | 18.122900 |
| std | 219.130206 | 97.233299 | 0.268533 | 3.508697 | 8.663358 | 380.656313 |
| min | -80995.000000 | -11062.060000 | 2010.000000 | 1.000000 | 1.000000 | -168469.600000 |
| 25% | 1.000000 | 1.250000 | 2011.000000 | 5.000000 | 7.000000 | 3.750000 |
| 50% | 3.000000 | 2.080000 | 2011.000000 | 8.000000 | 15.000000 | 9.870000 |
| 75% | 10.000000 | 4.130000 | 2011.000000 | 11.000000 | 22.000000 | 17.400000 |
| max | 80995.000000 | 38970.000000 | 2011.000000 | 12.000000 | 31.000000 | 168469.600000 |
# Inspect rows priced at or below zero: free items (0.00) plus two
# negative bad-debt adjustments.
df_up = data[data['UnitPrice'] <= 0]
df_up['UnitPrice'].value_counts()
0.00 2510 -11062.06 2 Name: UnitPrice, dtype: int64
# Distinct invoices per month that contain zero-priced (free) items.
# Selecting InvoiceNo before nunique() avoids computing nunique for every
# column of the grouped frame.
free_gift = data[data.UnitPrice == 0].groupby('Month')['InvoiceNo'].nunique().sort_values(ascending = False)
free_gift
Month 11 264 4 261 10 238 3 216 12 200 7 185 5 171 9 159 6 151 1 135 8 99 2 76 Name: InvoiceNo, dtype: int64
# Keep only genuine sales: strictly positive quantity and unit price
# (drops returns, free gifts, and adjustment rows).
data_ud = data.loc[(data['Quantity'] > 0) & (data['UnitPrice'] > 0)]
data_ud.head()
| InvoiceNo | StockCode | Quantity | UnitPrice | CustomerID | Country | Date | Time | Year | Month | Day | Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | 6 | 2.55 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 15.30 |
| 1 | 536365 | 71053 | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 2 | 536365 | 84406B | 8 | 2.75 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 22.00 |
| 3 | 536365 | 84029G | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 4 | 536365 | 84029E | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
# Summary of the cleaned sales-only subset (all minimums now positive or zero-free).
data_ud.describe().round(2)
| Quantity | UnitPrice | Year | Month | Day | Total | |
|---|---|---|---|---|---|---|
| count | 524876.00 | 524876.00 | 524876.00 | 524876.00 | 524876.00 | 524876.00 |
| mean | 10.62 | 3.92 | 2010.92 | 7.55 | 15.02 | 20.27 |
| std | 156.28 | 36.09 | 0.27 | 3.51 | 8.66 | 271.69 |
| min | 1.00 | 0.00 | 2010.00 | 1.00 | 1.00 | 0.00 |
| 25% | 1.00 | 1.25 | 2011.00 | 5.00 | 7.00 | 3.90 |
| 50% | 4.00 | 2.08 | 2011.00 | 8.00 | 15.00 | 9.92 |
| 75% | 11.00 | 4.13 | 2011.00 | 11.00 | 22.00 | 17.70 |
| max | 80995.00 | 13541.33 | 2011.00 | 12.00 | 31.00 | 168469.60 |
# Top 10 countries by number of distinct invoices.
# Selecting InvoiceNo before nunique() skips the needless per-column
# nunique computation over the whole grouped frame.
top10_orders = data_ud.groupby('Country')['InvoiceNo'].nunique().sort_values(ascending = False).head(10)
top10_orders
Country United Kingdom 18019 Germany 457 France 392 EIRE 288 Belgium 98 Netherlands 94 Spain 90 Portugal 58 Australia 57 Switzerland 54 Name: InvoiceNo, dtype: int64
# Bar chart: top 10 countries by number of orders placed.
country_orders = [
    go.Bar(
        x=list(top10_orders.index),
        y=list(top10_orders.values),
        marker={'color': 'purple'},
    )
]
layout1 = go.Layout(
    title='Top 10 Countries of Total Number of Orders Placed',
    xaxis={'title': 'Country'},
    yaxis={'title': 'Number of Orders'},
)
figure = go.Figure(data=country_orders, layout=layout1)
pyplot(figure)
# Top 10 countries by total quantity sold.
# Sum only the Quantity column: a frame-wide .sum() also aggregates the
# non-numeric columns, which is wasteful and an error in pandas >= 2.0.
top10_qty = data_ud.groupby('Country')['Quantity'].sum().sort_values(ascending = False).head(10)
top10_qty
Country United Kingdom 4646603 Netherlands 200361 EIRE 147007 Germany 119154 France 112060 Australia 83891 Sweden 36078 Switzerland 30617 Spain 27933 Japan 26016 Name: Quantity, dtype: int64
# Bar chart: top 10 countries by quantity sold.
country_qty = [
    go.Bar(
        x=list(top10_qty.index),
        y=list(top10_qty.values),
        marker={'color': 'blue'},
    )
]
layout2 = go.Layout(
    title='Top 10 Countries of Quantity Sold',
    xaxis={'title': 'Country'},
    yaxis={'title': 'Quantity'},
)
figure = go.Figure(data=country_qty, layout=layout2)
pyplot(figure)
# Top 10 countries by total sales value.
# Sum only the Total column rather than aggregating the whole frame.
top10_sales = data_ud.groupby('Country')['Total'].sum().sort_values(ascending = False).head(10)
top10_sales
Country United Kingdom 9001192.244 Netherlands 285446.340 EIRE 283140.520 Germany 228678.400 France 209625.370 Australia 138453.810 Spain 61558.560 Switzerland 57067.600 Belgium 41196.340 Sweden 38367.830 Name: Total, dtype: float64
# Bar chart: top 10 countries by total sales.
country_sales = [
    go.Bar(
        x=list(top10_sales.index),
        y=list(top10_sales.values),
        marker={'color': 'orange'},
    )
]
layout3 = go.Layout(
    title='Top 10 Countries of Total Sales',
    xaxis={'title': 'Country'},
    yaxis={'title': 'Total Sales'},
)
figure = go.Figure(data=country_sales, layout=layout3)
pyplot(figure)
# Distinct invoices per (Year, Month), busiest months first.
# Column-first selection avoids computing nunique over every column.
top_months_orders = data_ud.groupby(['Year','Month'])['InvoiceNo'].nunique().sort_values(ascending = False)
top_months_orders
Year Month
2011 11 2769
10 2040
9 1837
5 1681
2010 12 1559
2011 6 1533
7 1475
3 1454
8 1361
4 1246
2 1100
1 1086
12 819
Name: InvoiceNo, dtype: int64
# Bar chart of monthly order counts (descending).
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
ax = top_months_orders.plot(kind='bar', figsize=(12, 9))
ax.set_xlabel('Year, Month', fontsize=15, labelpad=20)
ax.set_ylabel('Number of Orders', fontsize=15, labelpad=20)
ax.set_title('Total Number of Orders by Month', fontsize=20, pad=30)
plt.xticks(rotation=45)
plt.show()
# Total quantity sold per (Year, Month), largest first.
# Sum only the Quantity column rather than the whole frame.
top_months_qty = data_ud.groupby(['Year','Month'])['Quantity'].sum().sort_values(ascending = False)
top_months_qty
Year Month
2011 11 751077
10 621029
9 569573
8 421020
7 399693
5 394998
6 388511
1 387099
3 376599
2010 12 358019
2011 12 313612
4 307953
2 282934
Name: Quantity, dtype: int64
# Bar chart of monthly quantity sold (descending).
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
ax = top_months_qty.plot(kind='bar', figsize=(12, 9))
ax.set_xlabel('Year, Month', fontsize=15, labelpad=20)
ax.set_ylabel('Quantity Sold', fontsize=15, labelpad=20)
ax.set_title('Total Quantity Sold by Months', fontsize=20, pad=30)
plt.xticks(rotation=45)
plt.show()
# Total sales per (Year, Month), largest first.
# Sum only the Total column rather than the whole frame.
top_months_sales = data_ud.groupby(['Year','Month'])['Total'].sum().sort_values(ascending = False)
top_months_sales
Year Month
2011 11 1503329.780
10 1151263.730
9 1056435.192
2010 12 821452.730
2011 5 769281.760
6 760547.010
8 757841.380
7 718076.121
3 716215.260
1 689811.610
12 637790.330
4 536968.491
2 522545.560
Name: Total, dtype: float64
# Bar chart of monthly total sales (descending).
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
ax = top_months_sales.plot(kind='bar', figsize=(12, 9))
ax.set_xlabel('Year, Month', fontsize=15, labelpad=20)
ax.set_ylabel('Total Sales', fontsize=15, labelpad=20)
ax.set_title('Total Sales by Months', fontsize=20, pad=30)
plt.xticks(rotation=45)
plt.show()
The average transaction value for each InvoiceNo is 533.14 GBP.
# Average transaction value = total sales / number of distinct invoices.
sum_total = data_ud['Total'].sum()
# nunique() counts distinct invoices directly, without materialising the
# intermediate unique-values array that len(unique()) builds.
count_inv = data_ud['InvoiceNo'].nunique()
avg_price = sum_total / count_inv
avg_price
533.144236172345
# Per-customer behaviour: number of invoices, total quantity, total spend.
# String aggregation names replace np.sum, which pandas 2.x deprecates
# (and removes) inside agg(); the results are identical.
cust_cba = data_ud.groupby('CustomerID').agg({'InvoiceNo': 'nunique',
                                              'Quantity': 'sum',
                                              'Total': 'sum'})
cust_cba
| InvoiceNo | Quantity | Total | |
|---|---|---|---|
| CustomerID | |||
| 12346 | 1 | 74215 | 77183.60 |
| 12347 | 7 | 2458 | 4310.00 |
| 12348 | 4 | 2341 | 1797.24 |
| 12349 | 1 | 631 | 1757.55 |
| 12350 | 1 | 197 | 334.40 |
| ... | ... | ... | ... |
| 18281 | 1 | 54 | 80.82 |
| 18282 | 2 | 103 | 178.05 |
| 18283 | 16 | 1357 | 2045.53 |
| 18287 | 3 | 1586 | 1837.28 |
| COD | 1428 | 420418 | 1754901.91 |
4339 rows × 3 columns
# descriptive analysis on customer behavior
# NOTE(review): the synthetic 'COD' customer (all guest orders pooled together)
# dominates the max values — consider excluding it from these statistics.
cust_cba.describe()
| InvoiceNo | Quantity | Total | |
|---|---|---|---|
| count | 4339.000000 | 4339.000000 | 4.339000e+03 |
| mean | 4.600138 | 1284.193823 | 2.452537e+03 |
| std | 22.943499 | 8119.989390 | 2.808589e+04 |
| min | 1.000000 | 1.000000 | 3.750000e+00 |
| 25% | 1.000000 | 159.000000 | 3.065050e+02 |
| 50% | 2.000000 | 378.000000 | 6.685800e+02 |
| 75% | 5.000000 | 990.500000 | 1.660890e+03 |
| max | 1428.000000 | 420418.000000 | 1.754902e+06 |
# Re-inspect the cleaned sales data before building the RFM values.
data_ud.head()
| InvoiceNo | StockCode | Quantity | UnitPrice | CustomerID | Country | Date | Time | Year | Month | Day | Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | 6 | 2.55 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 15.30 |
| 1 | 536365 | 71053 | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 2 | 536365 | 84406B | 8 | 2.75 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 22.00 |
| 3 | 536365 | 84029G | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
| 4 | 536365 | 84029E | 6 | 3.39 | 17850 | United Kingdom | 2010-12-01 | 08:26:00 | 2010 | 12 | 1 | 20.34 |
# Recency input: each customer's most recent purchase date.
r_date = data_ud.groupby('CustomerID')['Date'].agg('max')
r_date
CustomerID
12346 2011-01-18
12347 2011-12-07
12348 2011-09-25
12349 2011-11-21
12350 2011-02-02
...
18281 2011-06-12
18282 2011-12-02
18283 2011-12-06
18287 2011-10-28
COD 2011-12-09
Name: Date, Length: 4339, dtype: datetime64[ns]
# Reference date for recency: the latest purchase date in the whole dataset.
set_date = data_ud.Date.max()
set_date
Timestamp('2011-12-09 00:00:00')
# Recency in whole days: gap between the reference date and each
# customer's last purchase, smallest (most recent) first.
r_value = (set_date - r_date).dt.days.sort_values(ascending=True)
r_value
CustomerID
COD 0
17428 0
17490 0
17581 0
13113 0
...
14237 373
13747 373
18011 373
13065 373
15350 373
Name: Date, Length: 4339, dtype: int64
# Recency distribution: median 50 days, maximum 373.
r_value.describe()
count 4339.000000 mean 92.038258 std 100.010502 min 0.000000 25% 17.000000 50% 50.000000 75% 141.500000 max 373.000000 Name: Date, dtype: float64
# Histogram of recency values.
sns.set(style='darkgrid')
plt.hist(r_value, bins=30)
plt.show()
# Frequency: distinct invoices per customer, most frequent first.
f_value = data_ud.groupby('CustomerID')['InvoiceNo'].nunique().sort_values(ascending=False)
f_value
CustomerID
COD 1428
12748 209
14911 201
17841 124
13089 97
...
15314 1
15313 1
15308 1
15307 1
15300 1
Name: InvoiceNo, Length: 4339, dtype: int64
# Frequency distribution: heavily right-skewed (median 2, max 1428 from 'COD').
f_value.describe()
count 4339.000000 mean 4.600138 std 22.943499 min 1.000000 25% 1.000000 50% 2.000000 75% 5.000000 max 1428.000000 Name: InvoiceNo, dtype: float64
# Histogram of frequency, truncated below 40 to keep the long tail readable.
sns.set(style='darkgrid')
plt.hist(f_value[f_value < 40], bins=40)
plt.show()
The data may contain outliers, as the standard deviation is very large, so we restrict the plotted range of m_value when drawing the figure.
# Monetary: total spend per customer, largest first.
m_value = data_ud.groupby('CustomerID')['Total'].sum().sort_values(ascending=False)
m_value
CustomerID
COD 1754901.91
14646 280206.02
18102 259657.30
17450 194390.79
16446 168472.50
...
16878 13.30
17956 12.75
16454 6.90
14792 6.20
16738 3.75
Name: Total, Length: 4339, dtype: float64
# Monetary distribution: extreme right skew (max ~1.75M from the pooled 'COD' customer).
m_value.describe()
count 4.339000e+03 mean 2.452537e+03 std 2.808589e+04 min 3.750000e+00 25% 3.065050e+02 50% 6.685800e+02 75% 1.660890e+03 max 1.754902e+06 Name: Total, dtype: float64
# Histogram of monetary value, truncated below 3000 to keep the tail readable.
sns.set(style='darkgrid')
plt.hist(m_value[m_value < 3000], bins=30)
plt.show()
# Bin edges for recency (days since last purchase), frequency (orders),
# and monetary value (total spend); right=False makes every bin [lo, hi).
r_bins = [0, 30, 90, 180, 360, 720]
f_bins = [1, 2, 5, 10, 20, 2000]
m_bins = [0, 500, 2000, 5000, 10000, 2000000]
# Recency: fewer days since the last order earns a HIGHER score (5 = most recent).
r_score = pd.cut(r_value, r_bins, labels=[5, 4, 3, 2, 1], right=False)
# Frequency: more orders earns a higher score.
f_score = pd.cut(f_value, f_bins, labels=[1, 2, 3, 4, 5], right=False)
# Monetary: higher total spend earns a higher score.
m_score = pd.cut(m_value, m_bins, labels=[1, 2, 3, 4, 5], right=False)
# Combine the three score series side by side, aligned on CustomerID.
rfm = pd.concat([r_score, f_score, m_score], axis=1)
rfm
| Date | InvoiceNo | Total | |
|---|---|---|---|
| CustomerID | |||
| COD | 5 | 5 | 5 |
| 17428 | 5 | 5 | 5 |
| 17490 | 5 | 3 | 3 |
| 17581 | 5 | 5 | 5 |
| 13113 | 5 | 5 | 5 |
| ... | ... | ... | ... |
| 14237 | 1 | 1 | 1 |
| 13747 | 1 | 1 | 1 |
| 18011 | 1 | 1 | 1 |
| 13065 | 1 | 1 | 1 |
| 15350 | 1 | 1 | 1 |
4339 rows × 3 columns
# Give the inherited source-column names meaningful score names.
rfm.rename(columns={'Date': 'R_score',
                    'InvoiceNo': 'F_score',
                    'Total': 'M_score'},
           inplace=True)
rfm
| R_score | F_score | M_score | |
|---|---|---|---|
| CustomerID | |||
| COD | 5 | 5 | 5 |
| 17428 | 5 | 5 | 5 |
| 17490 | 5 | 3 | 3 |
| 17581 | 5 | 5 | 5 |
| 13113 | 5 | 5 | 5 |
| ... | ... | ... | ... |
| 14237 | 1 | 1 | 1 |
| 13747 | 1 | 1 | 1 |
| 18011 | 1 | 1 | 1 |
| 13065 | 1 | 1 | 1 |
| 15350 | 1 | 1 | 1 |
4339 rows × 3 columns
# The score columns come out of pd.cut as category dtype.
rfm.info()
<class 'pandas.core.frame.DataFrame'> Index: 4339 entries, COD to 15350 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 R_score 4339 non-null category 1 F_score 4339 non-null category 2 M_score 4339 non-null category dtypes: category(3) memory usage: 47.2+ KB
# Cast the categorical scores to plain ints so they support comparisons
# and arithmetic.
rfm = rfm.astype({'R_score': int, 'F_score': int, 'M_score': int})
rfm.describe()
| R_score | F_score | M_score | |
|---|---|---|---|
| count | 4339.000000 | 4339.000000 | 4339.000000 |
| mean | 3.821618 | 2.028117 | 1.887762 |
| std | 1.174880 | 0.997989 | 0.954997 |
| min | 1.000000 | 1.000000 | 1.000000 |
| 25% | 3.000000 | 1.000000 | 1.000000 |
| 50% | 4.000000 | 2.000000 | 2.000000 |
| 75% | 5.000000 | 3.000000 | 2.000000 |
| max | 5.000000 | 5.000000 | 5.000000 |
# Rate each dimension High/Low against its own mean, computed from the data
# itself rather than hard-coded from a previous describe() run — the original
# magic numbers silently go stale the moment the dataset changes.
rfm['R'] = np.where(rfm['R_score'] > rfm['R_score'].mean(), 'H', 'L')
rfm['F'] = np.where(rfm['F_score'] > rfm['F_score'].mean(), 'H', 'L')
rfm['M'] = np.where(rfm['M_score'] > rfm['M_score'].mean(), 'H', 'L')
# Three-letter RFM pattern per customer, e.g. 'HHL'.
rfm['Value'] = rfm['R'] + rfm['F'] + rfm['M']
rfm
| R_score | F_score | M_score | R | F | M | Value | |
|---|---|---|---|---|---|---|---|
| CustomerID | |||||||
| COD | 5 | 5 | 5 | H | H | H | HHH |
| 17428 | 5 | 5 | 5 | H | H | H | HHH |
| 17490 | 5 | 3 | 3 | H | H | H | HHH |
| 17581 | 5 | 5 | 5 | H | H | H | HHH |
| 13113 | 5 | 5 | 5 | H | H | H | HHH |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 14237 | 1 | 1 | 1 | L | L | L | LLL |
| 13747 | 1 | 1 | 1 | L | L | L | LLL |
| 18011 | 1 | 1 | 1 | L | L | L | LLL |
| 13065 | 1 | 1 | 1 | L | L | L | LLL |
| 15350 | 1 | 1 | 1 | L | L | L | LLL |
4339 rows × 7 columns
# Create customer segmentations: map a three-letter RFM pattern (e.g. 'HLH')
# to a named segment. Any pattern not listed (i.e. 'LLL') falls through to
# 'Lost Customers'.
_SEGMENT_BY_PATTERN = {
    'HHH': 'Best Customers',
    'HLH': 'Potential Loyalists',
    'HLL': 'New Customers',
    'HHL': 'Promising Customers',
    'LHH': 'Cant Lose Them',
    'LLH': 'At Risk Customers',
    'LHL': 'About to Sleep',
}

def trans_value(x):
    """Return the segment label for RFM pattern *x* ('Lost Customers' if unknown)."""
    return _SEGMENT_BY_PATTERN.get(x, 'Lost Customers')
# Attach the segment label to every customer.
rfm['Segmentation'] = rfm['Value'].map(trans_value)
rfm
| R_score | F_score | M_score | R | F | M | Value | Segmentation | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | ||||||||
| COD | 5 | 5 | 5 | H | H | H | HHH | Best Customers |
| 17428 | 5 | 5 | 5 | H | H | H | HHH | Best Customers |
| 17490 | 5 | 3 | 3 | H | H | H | HHH | Best Customers |
| 17581 | 5 | 5 | 5 | H | H | H | HHH | Best Customers |
| 13113 | 5 | 5 | 5 | H | H | H | HHH | Best Customers |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14237 | 1 | 1 | 1 | L | L | L | LLL | Lost Customers |
| 13747 | 1 | 1 | 1 | L | L | L | LLL | Lost Customers |
| 18011 | 1 | 1 | 1 | L | L | L | LLL | Lost Customers |
| 13065 | 1 | 1 | 1 | L | L | L | LLL | Lost Customers |
| 15350 | 1 | 1 | 1 | L | L | L | LLL | Lost Customers |
4339 rows × 8 columns
# Segment sizes, largest first, for plotting.
rfm_seg = rfm['Segmentation'].value_counts()
# Bar chart of the number of customers in each segment.
cust_seg_bar = [
    go.Bar(x=rfm_seg.index,
           y=rfm_seg.values,
           marker={'color': 'purple'})
]
layout = go.Layout(title='Customer Segmentation Result',
                   xaxis={'title': 'Customer Segmentation'})
cust_seg_layout = go.Figure(data=cust_seg_bar, layout=layout)
pyplot(cust_seg_layout)
# Pie chart of the share of customers in each segment.
cust_seg_pie = [
    go.Pie(labels=rfm_seg.index,
           values=rfm_seg.values,
           textfont={'size': 15})
]
layout = go.Layout(title='Customer Segmentation Percentage')
cust_seg_pielayout = go.Figure(data=cust_seg_pie, layout=layout)
pyplot(cust_seg_pielayout)
Besides the United Kingdom, the Netherlands, EIRE, Germany, France, and Australia are recommended to receive the most attention and effort.
23.9% 'Best Customers': keeping customers satisfied with the product and services is the top priority. The company can also reward these customers with first access to new products and promotions. Further analysis of customer preferences may reveal additional opportunities.
23.4% 'Potential Loyalists': it is important to offer memberships or recommend related products to upsell them and help them become best customers.
21.3% 'Lost Customers': this is a relatively large customer segment. Though it is often challenging to re-engage lost customers, their high value makes it worthwhile to try. It is important to communicate with them based on their specific preferences, analyze their earlier transaction data, and send email campaigns to reinforce brand recall.
19.1% ‘New Customer’: it is important to start building relationships with these customers, and keep their purchases valued and satisfied, also company can send special offers to increase their visits.
10.5% ‘At Risk Customers’: it is important to bring them back by sending out relevant promotions, running surveys and providing personalized reactivation campaigns to reconnect and encourage another purchase.